#delimit ;
*Session housekeeping. From here on every statement, comments included, ends at a semicolon;
*Close any log that is still open. capture suppresses the error raised when no log is open;
capture log close;
*Do not stop for the more prompt while output scrolls;
set more off;
*Allow pause commands to take effect;
pause on;
*Start from an empty session;
clear all;

********************************************************************************;
* This do-file replicates Quinby, Shanker "Wages and the Internet"         *****;
* By Laura Quinby (lquinby@fas.harvard.edu), April 30, 2013                *****;
********************************************************************************;

*The structure of this do-file is as follows;

*Merge CPS large-county identifiers with Forman et al.'s replication dataset;
*The CPS datafile also contains some additional, unused wage variables;
*If you would like the code that created the CPS data, please contact the author;

*Attempt to replicate Forman et al.'s original specification on the large-county sample;
*Note that Forman et al. included a lot of seemingly redundant controls;
*We cannot include all of these controls once we reduce the sample size from 3000 to 200;

*Re-specify the model keeping to the spirit of Forman et al. (2012);
                                       
*Load the CPS file: large-county identifiers and additional, unused wage variables;
use final_cps_growth.dta;

*Attach Forman's replication dataset, matching one row per county;
*keep(match) retains only the counties present in both files;
merge 1:1 county using countygrowth.dta, keep(match);
*The merge indicator is not needed once the unmatched rows are gone;
drop _merge;

*Re-run Forman et al.'s original specifications on the limited county sample;

*Two global macros, controls and change, hold the covariate lists so they can be added to each regression quickly;
global controls lnpop pctblk1990 pctunivp1990 pctHSp1990 pctbelowPL1990 medhhinc1990 carnegie1_enr frac_in_eng_prog npatent1980s frprof pct65p1990 netmig95;
global change change_totalpop change_pctblk change_pctunivp change_pctHSp change_pct65 change_netmig;

*Table 2 of Forman et al. (2012), main specification, large-county sample;
*OLS with heteroskedasticity-robust standard errors;
regress wagediff
    surv_deeppost00 indivhomeinternet00_cty tech95
    $controls $change,
    vce(robust);

*Table 4 of Forman et al. (2012), Equation 7, large-county sample;
*LIML with surv_deeppost00 instrumented by two excluded instruments. The first option also reports the first stage;
ivregress liml wagediff
    (surv_deeppost00 = iv_othcprogrammerswt iv_num_a)
    indivhomeinternet00_cty tech95 $controls $change,
    first vce(robust);

*ARPANET is not a valid instrument, so re-estimate without it;
*iv_num_a is the instrument removed here, presumably the ARPANET one - confirm against the data documentation;
ivregress liml wagediff
    (surv_deeppost00 = iv_othcprogrammerswt)
    indivhomeinternet00_cty tech95 $controls $change,
    first vce(robust);

*Interactions with All-High;
*Because the all-high effect is supposed to be relative, re-calibrate it on this sample;
*According to the original note this requires variables that are only in Forman et al.'s countyyear.dta dataset;
*NOTE(review): pctunivp1990 and medhhinc1990 are already used by the regressions above, so they appear to be in memory before this merge - confirm what the merge adds beyond the match filter;
*NOTE(review): the earlier merge was keyed on county alone - confirm that year exists in memory and that county year identifies rows uniquely in both files;

*keep(match) retains matched rows only. _merge stays in the data and is dropped just before the final save;
merge 1:1 county year using countyyear.dta, keepusing(pctunivp1990 medhhinc1990) keep(match);

*Create new all-high dummy;
*Terciles replace the quartiles used before, because there are not enough observations;
xtile highpop2 = lnpop,        nquantiles(3);
xtile highed2  = pctunivp1990, nquantiles(3);
xtile highinc2 = medhhinc1990, nquantiles(3);
xtile highind2 = tech95,       nquantiles(3);

*allhigh2 equals 1 only when all four measures fall in the top tercile;
*NOTE(review): a row with a missing tercile is coded 0 rather than missing, because comparing a missing value with 3 is false;
generate allhigh2 = highpop2==3 & highed2==3 & highinc2==3 & highind2==3;

*Interactions with all high: the treatment and both instruments, each multiplied by the dummy;
generate allhigh_surv_deeppost00 = surv_deeppost00 * allhigh2;
generate ivd_num_a2 = iv_num_a * allhigh2;
*NOTE(review): iv_othcp is presumably an abbreviation of iv_othcprogrammerswt, which only resolves while variable abbreviation is on - confirm and consider spelling it out;
generate ivd_othcp2 = iv_othcp * allhigh2;

*Robustness checks;

*Check for mis-specification;
*The two fits below share one regressor list and differ only in how the standard errors are computed;
local misspec_rhs
    surv_deeppost00 allhigh_surv_deeppost00 allhigh2
    highpop2 highed2 highinc2 highind2
    indivhomeinternet00_cty tech95 $controls $change;

*Conventional standard errors, then fitted values and residuals for the diagnostic plots;
regress wagediff `misspec_rhs';
predict yhat, xb;
generate resid = wagediff - yhat;

*Same model with robust standard errors, for comparison;
regress wagediff `misspec_rhs', vce(robust);
*The robust and regular standard errors are a bit different, but the difference doesn't seem to be due to misspecification;

/*Residual plots, kept for reference but not run;

*Look at the main treatments;
twoway scatter resid surv_deeppost00;
*some outliers with high levels of investment and high residuals;
twoway scatter resid allhigh_surv_deeppost00;

*Look at controls;
twoway scatter resid lnpop;
twoway scatter resid pctblk1990;
twoway scatter resid pctunivp1990;
*note that HS and university are not mutually-exclusive groups;
twoway scatter resid pctHSp1990;
twoway scatter resid pctbelowPL1990;
*note that median household income is correlated with percent below the poverty line;
twoway scatter resid medhhinc1990;
twoway scatter resid carnegie1_enr;
twoway scatter resid frac_in_eng_prog;
twoway scatter resid npatent1980s;
*note that frprof is correlated with education;
twoway scatter resid frprof;
twoway scatter resid pct65p1990;
twoway scatter resid netmig95;
twoway scatter resid change_totalpop;
twoway scatter resid change_pctblk;
twoway scatter resid change_pctunivp;
twoway scatter resid change_pctHSp;
twoway scatter resid change_pct65;
twoway scatter resid change_netmig;
*/

*Some potential problems with multicollinearity;
*Each 1990 level is listed next to its change variable;
correlate
    pctblk1990 change_pctblk
    pctunivp1990 change_pctunivp
    pctHSp1990 change_pctHSp
    pct65p1990 change_pct65
    netmig95 change_netmig
    lnpop change_totalpop;
*The education levels and their changes are highly correlated, same for net migration;

correlate medhhinc1990 pctbelowPL1990;
correlate frprof pctunivp1990 pctHSp1990;
*Conclusion: drop professional careers and percent below the poverty line from the controls;

********************************************************************************;
*FINAL SPECIFICATIONS, ALTERED CONTROLS                                   *****;
********************************************************************************;
*NOTE: Forman et al.'s change variables span 1990-2000, while the internet investment variable spans 1995-2000;
*There is potentially a problem with post-treatment bias, since investment could spawn education, migration;
*To make our results more comparable with Forman et al. we retain their change variables;
*But note that the treatment effect is potentially biased. Future research should investigate this;

*Altered control set. Relative to the controls global it omits lnpop, pctbelowPL1990 and frprof;
global controls_n medhhinc1990 pctblk1990 pctunivp1990 pctHSp1990 pct65p1990 carnegie1_enr frac_in_eng_prog npatent1980s netmig95;
*The change variables are the same six as in the change global;
global change_n change_totalpop change_pctblk change_pctunivp change_pctHSp change_pct65 change_netmig;

*Folder prefix for every outreg2 table written below;
*Defined explicitly so the output paths do not depend on an undefined local macro silently expanding to nothing;
*Empty writes the tables to the working directory. To redirect them, set it to a folder path ending in a slash;
local logfolder "";

*Average weekly wages, OLS, no interactions;
regress wagediff
    surv_deeppost00 indivhomeinternet00_cty tech95
    $controls_n $change_n,
    vce(robust);
*Starts the OLS table;
outreg2 using `logfolder'ReplicateOLS, replace;

*Average weekly wages, OLS, interaction;
regress wagediff
    surv_deeppost00 allhigh_surv_deeppost00 allhigh2
    indivhomeinternet00_cty tech95
    $controls_n $change_n,
    vce(robust);
*Adds a second column to the OLS table and also writes it as TeX;
outreg2 using `logfolder'ReplicateOLS, append tex;

*Average weekly wages, LIML, no interactions;
ivregress liml wagediff
    (surv_deeppost00 = iv_othcprogrammerswt)
    indivhomeinternet00_cty tech95 $controls_n $change_n,
    first vce(robust);
*Starts the LIML table;
outreg2 using `logfolder'ReplicateLIML, replace;

*Average weekly wages, LIML, interactions;
*Both the treatment and its all-high interaction are instrumented, each by the matching instrument;
ivregress liml wagediff
    (surv_deeppost00 allhigh_surv_deeppost00 = iv_othcprogrammerswt ivd_othcp2)
    allhigh2 indivhomeinternet00_cty tech95 $controls_n $change_n,
    first vce(robust);
outreg2 using `logfolder'ReplicateLIML, append tex;

*Summary statistics;
*Restricted to the estimation sample of the most recent estimation command;
*NOTE(review): this assumes outreg2 leaves the stored estimation results of the LIML interaction model intact - confirm;
summarize wagediff surv_deeppost00 indivhomeinternet00_cty tech95 $controls_n $change_n if e(sample);

*First Stage;
*NOTE(review): this regression carries no sample restriction, so it may use rows that the second stage drops - confirm the samples agree;
regress surv_deeppost00
    iv_othcprogrammerswt indivhomeinternet00_cty tech95
    $controls_n $change_n,
    vce(robust);
outreg2 using `logfolder'FirstStage, replace tex;

********************************************************************************;
* CREATE THE REPLICATION DATASET                                           *****;
********************************************************************************;

*Remove the merge indicator and the diagnostic variables (residuals and fitted values) created earlier in this do-file;
drop _merge resid yhat;
*Write the cleaned data to disk, overwriting any existing copy;
save QuinbyShanker_Replication.dta, replace;
